import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider

from dragonspider.items import Topic, Message


class DragonSpider(BaseSpider):
    """Crawl forums.dragonmount.com, yielding ``Topic`` and ``Message`` items.

    Starts from a board page, emits a ``Topic`` item plus a follow-up
    ``Request`` for every thread link found there, and scrapes each post
    (author, timestamp, body text) from the topic pages it visits.
    Pagination links are followed only inside topics.
    """

    domain_name = "forums.dragonmount.com"
    allowed_domains = ["forums.dragonmount.com"]
    start_urls = [
        "http://forums.dragonmount.com/index.php/board,1.0.html"
#        "http://forums.dragonmount.com/index.php?topic=60716.0"
    ]

    # Compiled once at class-creation time instead of per call; raw strings
    # so "\d" / "\." are real regex escapes, not Python string escapes.
    # Topic ids appear as either "topic,123." or "index.php?topic=123.".
    _TOPIC_ID_RE = re.compile(r'topic[,=](\d+)\.')
    # Message ids appear as ".msg123.html" or ".msg123#..." in permalinks.
    _MESSAGE_ID_RE = re.compile(r'\.msg(\d+)((\.html)|#)')

    def parse(self, response):
        """Handle both board and topic pages, dispatching on the URL.

        Board pages ('board' in the URL) yield ``Topic`` items and requests
        for each thread; topic pages ('topic' in the URL) yield ``Message``
        items for each post and follow their own pagination links.
        """
        hxs = HtmlXPathSelector(response)

        # In a board (list of threads)
        if 'board' in response.url:
            # Thread links end in ".0" (first page of the thread).
            topics = hxs.select('//a[contains(@href,"index.php?topic") and contains(@href,".0")]')
            for topic in topics:
                url = topic.select('@href').extract()[0]
                # "#new" anchors duplicate the plain topic link; skip them.
                if url.endswith('#new'):
                    continue
                title = topic.select('text()').extract()[0]
                # Skip links whose text is "1" (presumably in-row page-number
                # links rather than thread titles — TODO confirm on the site).
                if title == '1':
                    continue
                topic_id = self._topic_id_from_url(url)
                yield Topic(id=topic_id, title=title, url=url)
                yield Request(url, callback=self.parse)

        # In a topic / thread
        in_topic = 'topic' in response.url
        if in_topic:
            topic_id = self._topic_id_from_url(response.url)
            # Each post appears to be rendered in a cellpadding="5" table.
            messages = hxs.select('//table[@cellpadding="5"]')
            for msg in messages:
                msg_url = msg.select('.//a[contains(@href, ".msg")]/@href').extract()[0]
                msg_id = self._message_id_from_url(msg_url)
                author = msg.select('.//a[contains(@href, "profile")]/text()').extract()[0]
                # The timestamp is the last text node of the " on:" header div;
                # the final character is stripped (presumably a closing quote
                # mark — TODO confirm against live markup).
                when = msg.select('.//div[contains(., " on:")]/text()').extract()[-1]
                when = when[:-1].strip()
                text = '\n\n'.join(msg.select('.//div[@class="post"]/text()').extract())
                yield Message(id=msg_id, topic=topic_id, author=author, datetime=when, text=text)

        # Follow pagination links, but only inside topics for now: board
        # pagination is deliberately ignored so only first-page threads load.
        if in_topic:
            other_pages = set(hxs.select('//a[@class="navPages"]/@href').extract())
            for url in other_pages:
                yield Request(url, callback=self.parse)

    def _topic_id_from_url(self, url):
        """Return the numeric topic id (as a string) from a forum URL.

        Raises AttributeError if the URL contains no topic id, matching the
        original fail-fast behavior.
        """
        return self._TOPIC_ID_RE.search(url).group(1)

    def _message_id_from_url(self, url):
        """Return the numeric message id (as a string) from a post permalink.

        Raises AttributeError if the URL contains no message id, matching the
        original fail-fast behavior.
        """
        return self._MESSAGE_ID_RE.search(url).group(1)


SPIDER = DragonSpider()
